import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
# NOTE(review): the 'seaborn' style name was removed in matplotlib>=3.8 (renamed
# 'seaborn-v0_8') — confirm the pinned matplotlib version.
plt.style.use('seaborn')
sns.set(font_scale=2.5) # Use the seaborn scheme instead of matplotlib's defaults, and set a global font scale so per-plot font sizes need not be specified.
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Preoperative data (features measured before surgery)
data = pd.read_csv('train.csv')
data.head()
| Patient Initials | Age | Menopause | Unnamed: 3 | Method | Histologic type | Grade | Unnamed: 7 | CA125 (IU/ml) | Myometrial invasion depth | Tumor size (largest diameter) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LMQI | 54 | Yes | NaN | Dilatation/currettage | Endometrioid | I | NaN | 14.94 | Less than 50% (< 1/2) | 3 |
| 1 | ZJYU | 55 | Yes | NaN | Dilatation/currettage | Endometrioid | I | NaN | 20.38 | None | 1.7 |
| 2 | YGFA | 45 | No | NaN | Dilatation/currettage | Endometrioid | I | NaN | 43.47 | Less than 50% (< 1/2) | 1.25 |
| 3 | YYHI | 62 | Yes | NaN | Dilatation/currettage | Endometrioid | I | NaN | 13.83 | Less than 50% (< 1/2) | 4.7 |
| 4 | CJQI | 55 | Yes | NaN | Dilatation/currettage | Endometrioid | I | NaN | 9.02 | Less than 50% (< 1/2) | 1.3 |
# Drop export-artifact columns ('Unnamed: *') and columns unusable as features:
# 'Patient Initials' is an identifier; 'Histologic type' is dropped here —
# presumably constant in this cohort, TODO confirm.
data.drop(['Unnamed: 3','Unnamed: 7','Patient Initials', 'Histologic type'], axis=1, inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252 entries, 0 to 251 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 252 non-null int64 1 Menopause 252 non-null object 2 Method 252 non-null object 3 Grade 252 non-null object 4 CA125 (IU/ml) 251 non-null float64 5 Myometrial invasion depth 252 non-null object 6 Tumor size (largest diameter) 252 non-null object dtypes: float64(1), int64(1), object(5) memory usage: 13.9+ KB
# PostOperative 데이터
label_data = pd.read_csv('label.csv')
label_data.head()
label_data.drop(['Unnamed: 7'], axis=1, inplace=True)
label_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252 entries, 0 to 251 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Route 252 non-null object 1 Stage 252 non-null object 2 Histologic diagnosis 252 non-null object 3 Grade 252 non-null object 4 Myometrial invasion depth 252 non-null object 5 Tumor size (largest diameter) 252 non-null float64 6 Extrauterine involvement 252 non-null object 7 Lymphovascular space invasion (LVSI) 252 non-null object 8 Metastasis of pelvic lymph node 252 non-null object 9 Metastasis of para-aortic lymph node 252 non-null int64 10 LN metastasis 252 non-null object dtypes: float64(1), int64(1), object(9) memory usage: 21.8+ KB
label_data['Grade'].unique()
array(['I', 'II', 'III', 'Inadequate for interpretation',
' '],
dtype=object)
# Collapse every postoperative grade other than I / II into one 'others' bucket.
rare_grades = ['III', 'Inadequate for interpretation', ' ']
label_data['Grade'] = label_data['Grade'].replace(rare_grades, 'others')
label_Grade = label_data['Grade']
label_Grade.unique()
array(['I', 'II', 'others'], dtype=object)
label_data['Stage'].unique()
array(['Ia', 'II', 'IIIc', 'Ib', 'IIIc1'], dtype=object)
# Merge the sparsely-populated stages (II, IIIc, IIIc1) into 'Ia',
# leaving a binary Ia / Ib stage variable.
label_data['Stage'] = label_data['Stage'].replace(['II', 'IIIc', 'IIIc1'], 'Ia')
label_Stage = label_data['Stage']
label_Stage.unique()
array(['Ia', 'Ib'], dtype=object)
암수술전에 시행한 자궁내막조직검사 및 MRI에 따라
Group 1 (no myometrial invasion + grade 1),
Group 2 (no myometrial invasion + grade 2),
Group 3 (Myometrial invasion<50% + grade 1),
Group 4 (Myometrial invasion<50% + grade 2)로
분류한 후 각각의 group에 속한 환자들이 암수술 후에도 여전히 같은 group에 있을 가능성을 평가함(accuracy, NPV, PPV, sensitivity, specificity, AUC, Kappa) => total population에서도 시행함
# Group A: postoperative grade I with no myometrial invasion.
Group_A = ((label_data['Grade']== 'I') & (label_data['Myometrial invasion depth']=='None'))
Group_A.value_counts()
False 184 True 68 dtype: int64
# Group B: postoperative grade II with no myometrial invasion.
Group_B = ((label_data['Grade']== 'II') & (label_data['Myometrial invasion depth']=='None'))
Group_B.value_counts()
False 236 True 16 dtype: int64
# Group C: postoperative grade I with invasion of less than 50% of the myometrium.
Group_C = ((label_data['Grade']== 'I') & (label_data['Myometrial invasion depth']=='Less than 50%'))
Group_C.value_counts()
False 150 True 102 dtype: int64
# Group D: postoperative grade II with invasion of less than 50% of the myometrium.
Group_D = ((label_data['Grade']== 'II') & (label_data['Myometrial invasion depth']=='Less than 50%'))
Group_D.value_counts()
False 210 True 42 dtype: int64
Group A: 68
Group B: 16
Group C: 102
Group D: 42
전체 환자 수 : 252명, Group A~D: 228, 나머지: 24(Grade='others' 이거나 침습깊이가 More than 50%일 때)
data['Myometrial invasion depth'].unique()
array(['Less than 50% (< 1/2)', 'None'], dtype=object)
label_data['Myometrial invasion depth'].unique()
array(['Less than 50%', 'None', 'More than 50%'], dtype=object)
none = ((data['Myometrial invasion depth'] == 'None') & (label_data['Myometrial invasion depth'] == 'None'))
none.value_counts()
# None -> None: 71
False 181 True 71 Name: Myometrial invasion depth, dtype: int64
lnone = ((data['Myometrial invasion depth'] == 'None') & (label_data['Myometrial invasion depth'] == 'Less than 50%'))
lnone.value_counts()
# None -> Less than 50%: 72
False 180 True 72 Name: Myometrial invasion depth, dtype: int64
mnone = ((data['Myometrial invasion depth'] == 'None') & (label_data['Myometrial invasion depth'] == 'More than 50%'))
mnone.value_counts()
# None -> More than 50%: 4
False 248 True 4 Name: Myometrial invasion depth, dtype: int64
# plot_confusion_matrix was unused here and is removed in scikit-learn >= 1.2;
# import only confusion_matrix.
from sklearn.metrics import confusion_matrix
# Confusion matrix: preoperative vs postoperative myometrial invasion depth.
plt.rc('axes', labelsize=15)   # axis-label font size
plt.rc('xtick', labelsize=11)  # x-tick font size
plt.rc('ytick', labelsize=11)  # y-tick font size
# The pre- and postoperative columns use different level names
# ('Less than 50% (< 1/2)' vs 'Less than 50%'), so fix the label order
# explicitly instead of guessing the sorted union with hard-coded ticklabels
# (the previous '?' placeholders were wrong).
depth_labels = ['None', 'Less than 50% (< 1/2)', 'Less than 50%', 'More than 50%']
cf_matrix = confusion_matrix(data['Myometrial invasion depth'],
                             label_data['Myometrial invasion depth'],
                             labels=depth_labels)
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
ax.set_xlabel('Postoperative')
ax.set_ylabel('Preoperative')
ax.xaxis.set_ticklabels(depth_labels)
ax.yaxis.set_ticklabels(depth_labels)
plt.show()
Preoperative -> Postoperative
None -> More than 50%: 4 (1.58%)
Less than 50% -> None: 20 (7.93%)
data['Grade'].unique()
array(['I', 'II'], dtype=object)
label_data['Grade'].unique()
array(['I', 'II', 'others'], dtype=object)
none = ((data['Grade'] == 'I') & (label_data['Grade'] == 'I'))
none.value_counts()
# I -> I: 152 (the previous "71" was a stale copy-paste from the invasion-depth cells)
True 152 False 100 Name: Grade, dtype: int64
none = ((data['Grade'] == 'I') & (label_data['Grade'] == 'II'))
none.value_counts()
# I -> II: 24 (the previous "71" was a stale copy-paste)
False 228 True 24 Name: Grade, dtype: int64
none = ((data['Grade'] == 'I') & (label_data['Grade'] == 'others'))
none.value_counts()
# I -> others: 5 (the previous "I -> II: 71" was a stale copy-paste)
False 247 True 5 Name: Grade, dtype: int64
# Confusion matrix: preoperative vs postoperative tumor grade.
plt.rc('axes', labelsize=20)   # axis-label font size
plt.rc('xtick', labelsize=15)  # x-tick font size
plt.rc('ytick', labelsize=15)  # y-tick font size
# Fix the label order explicitly; the previous hard-coded y ticklabels used a '?'
# placeholder. The 'others' row is all zeros because the preoperative data only
# contains grades I and II.
grade_labels = ['I', 'II', 'others']
cf_matrix = confusion_matrix(data['Grade'], label_data['Grade'], labels=grade_labels)
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
ax.set_xlabel('Postoperative')
ax.set_ylabel('Preoperative')
ax.xaxis.set_ticklabels(grade_labels)
ax.yaxis.set_ticklabels(grade_labels)
plt.show()
Preoperative -> Postoperative
I -> others: 5 (1.98%)
II -> I: 25 (9.92%)
'II': 일반적인 세포로 보이지 않으면서 일반 세포보다 빠르게 자라는 암세포
Grade 또한 정확하게 예측하는 것이 좋겠지만, II단계까지 호르몬 치료를 하려고 Group을 나눴기 때문에 II 이하까지는 괜찮다고 가정한다.
cf_matrix = confusion_matrix(data['Grade'], label_data['Grade'])
cf_matrix
array([[152, 24, 5],
[ 25, 40, 6],
[ 0, 0, 0]], dtype=int64)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252 entries, 0 to 251 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 252 non-null int64 1 Menopause 252 non-null object 2 Method 252 non-null object 3 Grade 252 non-null object 4 CA125 (IU/ml) 251 non-null float64 5 Myometrial invasion depth 252 non-null object 6 Tumor size (largest diameter) 252 non-null object dtypes: float64(1), int64(1), object(5) memory usage: 13.9+ KB
data['Tumor size (largest diameter)'].unique()
array(['3', '1.7', '1.25', '4.7', '1.3', '2.3', '5.5', '0.6', '2', '.',
'3.7', '4.2', '1.5', '2.9', '2.2', '3.3', '5', '2.8', '0', '2.5',
'1.8', '5.8', '0.1', '4', '1', '2.6', '7.58', '3.4', '2.1', '3.5',
'1.2', '1.1', '4.5', '3.6', '0.9', '5.7', '3.1', '2.7', '0.8',
'2.4', '4.1', '1.6', '1.9', '8.3', '3.2', '4.4', '1.45', '17',
'3.9', '0.5'], dtype=object)
# One record holds a bare '.' (which keeps the column as object dtype);
# treat it as 0 and convert the whole column to float.
size_col = 'Tumor size (largest diameter)'
data[size_col] = data[size_col].replace('.', 0).astype('float')
# Partition column names by dtype: object columns are categorical, the rest numeric.
cat_columns = list(data.dtypes[data.dtypes == 'O'].index)
num_columns = [c for c in data.columns if c not in cat_columns]
print('Categorical columns: \n {}\n\n Numeric columns: \n{}\n'.format(cat_columns, num_columns))
Categorical columns: ['Menopause', 'Method', 'Grade', 'Myometrial invasion depth'] Numeric columns: ['Age', 'CA125 (IU/ml)', 'Tumor size (largest diameter)']
Categorical = data.select_dtypes(include=['object'])
Numerical = data.select_dtypes(include=['int64', 'float64'])
print('Categorical feature:\n', Categorical)
print('Numerical features:\n', Numerical)
Categorical feature:
Menopause Method Grade Myometrial invasion depth
0 Yes Dilatation/currettage I Less than 50% (< 1/2)
1 Yes Dilatation/currettage I None
2 No Dilatation/currettage I Less than 50% (< 1/2)
3 Yes Dilatation/currettage I Less than 50% (< 1/2)
4 Yes Dilatation/currettage I Less than 50% (< 1/2)
.. ... ... ... ...
247 No Pipelle biopsy I Less than 50% (< 1/2)
248 No Pipelle biopsy I Less than 50% (< 1/2)
249 No Pipelle biopsy II None
250 Yes Pipelle biopsy II Less than 50% (< 1/2)
251 No Pipelle biopsy II Less than 50% (< 1/2)
[252 rows x 4 columns]
Numerical features:
Age CA125 (IU/ml) Tumor size (largest diameter)
0 54 14.94 3.00
1 55 20.38 1.70
2 45 43.47 1.25
3 62 13.83 4.70
4 55 9.02 1.30
.. ... ... ...
247 46 16.40 3.00
248 52 14.60 2.90
249 50 28.40 0.00
250 62 14.10 1.00
251 51 39.21 1.50
[252 rows x 3 columns]
f, ax = plt.subplots(1, 2, figsize=(18, 8))
label_data['Grade'].value_counts().plot.pie(explode=[0, 0.1, 0.2], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Grade')
ax[0].set_ylabel('')
sns.countplot('Grade', data=label_data, ax=ax[1])
ax[1].set_title('Countplot - Grade')
plt.show()
(label_data['Grade']=='others').value_counts()
False 241 True 11 Name: Grade, dtype: int64
sns.heatmap(data.corr(), annot=True)
plt.show()
sns.distplot(data['CA125 (IU/ml)'][label_data['Grade']=='I'], kde=True, color='red', label='Grade I')
sns.distplot(data['CA125 (IU/ml)'][label_data['Grade']=='II'], kde=True, color='green', label='Grade II')
# sns.distplot(data['CA125 (IU/ml)'][label_data['Grade']=='others'], kde=True, color='blue', label='others')
plt.legend()
<matplotlib.legend.Legend at 0x23eef3d5820>
# Log-transform CA125 to reduce right skew; non-positive values are mapped to 0.
data['CA125 (IU/ml)'] = data['CA125 (IU/ml)'].map(lambda i: np.log(i) if i>0 else 0)
sns.distplot(data['CA125 (IU/ml)'][label_data['Grade']=='I'], kde=True, color='red', label='Grade I')
sns.distplot(data['CA125 (IU/ml)'][label_data['Grade']=='II'], kde=True, color='green', label='Grade II')
# sns.distplot(data['CA125 (IU/ml)'][label_data['Grade']=='others'], kde=True, color='blue', label='others')
plt.legend()
<matplotlib.legend.Legend at 0x23eef4de040>
sns.distplot(data['Tumor size (largest diameter)'][label_data['Grade']=='I'], kde=True, color='red', label='Grade I')
sns.distplot(data['Tumor size (largest diameter)'][label_data['Grade']=='II'], kde=True, color='green', label='Grade II')
# sns.distplot(data['Tumor size (largest diameter)'][label_data['Grade']=='others'], kde=True, color='blue', label='others')
plt.legend()
<matplotlib.legend.Legend at 0x23eef31b310>
# BUG FIX: this line previously read from the (already log-transformed) CA125
# column and wrote the result into the tumor-size column, silently overwriting
# the real tumor sizes (visible in the combined.head() output further down,
# where 'Tumor size' equals log of the CA125 column). Log-transform the
# tumor-size column itself; non-positive values map to 0, matching the CA125
# handling above.
data['Tumor size (largest diameter)'] = data['Tumor size (largest diameter)'].map(lambda i: np.log(i) if i>0 else 0)
sns.distplot(data['Tumor size (largest diameter)'][label_data['Grade']=='I'], kde=True, color='red', label='Grade I')
sns.distplot(data['Tumor size (largest diameter)'][label_data['Grade']=='II'], kde=True, color='green', label='Grade II')
# sns.distplot(data['Tumor size (largest diameter)'][label_data['Grade']=='others'], kde=True, color='blue', label='others')
plt.legend()
<matplotlib.legend.Legend at 0x23eef331ca0>
for i in Numerical:
plt.figure(figsize=(10, 5))
sns.countplot(x=i, data=data, hue=label_data['Grade'])
plt.legend(['I', 'II', 'others'])
plt.title(i)
plt.show()
for i in Categorical:
plt.figure(figsize=(10, 5))
sns.countplot(x=i, data=data, hue=label_data['Grade'])
plt.legend(['I', 'II', 'others'])
plt.title(i)
plt.show()
import plotly.express as px
fig = px.violin(data, y='Age', x=label_Grade, color='Menopause', box=True, points="all")
fig.show()
import plotly.express as px
fig = px.violin(data, y='CA125 (IU/ml)', x=label_Grade, color='Menopause', box=True, points="all")
fig.show()
import plotly.express as px
fig = px.violin(data, y='Tumor size (largest diameter)', x=label_Grade, color='Menopause', box=True, points="all")
fig.show()
# pd.isna(data).sum()
# data.describe()
# # 중간 값인 21로 우선 처리.
# data = data.fillna(21)
# pd.isna(data).sum()
# 일단은 수치형에 LabelEncoder. (나중에 따로 수치형은 정규화 해보기. - 수치형만 StadnardScaler를 해도 결과는 똑같음)
# select numerical features and encoding it
from sklearn.preprocessing import LabelEncoder
# Rank-encode each numeric column: every distinct value is mapped to its ordinal
# position among the column's sorted unique values. (apply() calls fit_transform
# per column, so the encoder is refit for each column.)
# Previously `le` was created but never used and a fresh LabelEncoder() was
# instantiated inside apply(); use the single instance instead.
le = LabelEncoder()
# select numerical features
numerical_features = data.select_dtypes(include=['int64', 'float64'])
# apply label encoding
numerical_features = numerical_features.apply(le.fit_transform)
numerical_features.head()
| Age | CA125 (IU/ml) | Tumor size (largest diameter) | |
|---|---|---|---|
| 0 | 24 | 95 | 95 |
| 1 | 25 | 140 | 140 |
| 2 | 15 | 193 | 193 |
| 3 | 32 | 87 | 87 |
| 4 | 25 | 44 | 44 |
# # 수치형 변수 스케일링
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# # select numerical features
# data[num_columns] = scaler.fit_transform(data[num_columns])
# data[num_columns]
# One-Hot encoding the categorical features using get_dummies()
# select categorical features (get_dummies itself is applied a few cells later,
# after 'Grade' has been mapped to 0/1 so it stays a single ordinal column)
categorical_features = data.select_dtypes(include=['object'])
# apply get_dummies encoding
#categorical_features = pd.get_dummies(categorical_features)
categorical_features.head()
| Menopause | Method | Grade | Myometrial invasion depth | |
|---|---|---|---|---|
| 0 | Yes | Dilatation/currettage | I | Less than 50% (< 1/2) |
| 1 | Yes | Dilatation/currettage | I | None |
| 2 | No | Dilatation/currettage | I | Less than 50% (< 1/2) |
| 3 | Yes | Dilatation/currettage | I | Less than 50% (< 1/2) |
| 4 | Yes | Dilatation/currettage | I | Less than 50% (< 1/2) |
# 'Grade' is ordinal, so map it to a single 0/1 column rather than dummy columns;
# the remaining categoricals are one-hot encoded.
categorical_features['Grade'] = categorical_features['Grade'].replace({'I': 0, 'II': 1})
categorical_features = pd.get_dummies(categorical_features)
categorical_features
| Grade | Menopause_No | Menopause_Yes | Method_Dilatation/currettage | Method_Hysteroscopy | Method_Pipelle biopsy | Myometrial invasion depth_Less than 50% (< 1/2) | Myometrial invasion depth_None | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| 4 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 247 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 248 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 249 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 250 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
| 251 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
252 rows × 8 columns
combined = pd.concat([data[num_columns], categorical_features], axis=1)
combined.head()
| Age | CA125 (IU/ml) | Tumor size (largest diameter) | Grade | Menopause_No | Menopause_Yes | Method_Dilatation/currettage | Method_Hysteroscopy | Method_Pipelle biopsy | Myometrial invasion depth_Less than 50% (< 1/2) | Myometrial invasion depth_None | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 54 | 2.704042 | 0.994748 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| 1 | 55 | 3.014554 | 1.103452 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 45 | 3.772071 | 1.327624 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3 | 62 | 2.626840 | 0.965782 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| 4 | 55 | 2.199444 | 0.788205 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
# separate features and target
X = combined
y = label_data['Grade']
from sklearn.preprocessing import LabelEncoder
# Encode the target as integers; classes are sorted alphabetically,
# so 'I' -> 0, 'II' -> 1, 'others' -> 2.
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0,
1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 1, 0, 1,
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 2, 1,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 2, 1, 2, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
0, 1, 0, 0, 0, 0, 0, 1, 1, 1])
# Fix for xgboost "ValueError: feature_names may not contain [, ] or <":
# those characters must not appear in column names, so replace them with '_'.
# (Removed the unused `XGBRegressor` import — it was never referenced, and the
# `xgboost.sklearn` import path is deprecated — plus the redundant re-imports of
# pandas/numpy, which are already imported at the top of the file.)
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X.columns = [regex.sub("_", col) if any(ch in str(col) for ch in ('[', ']', '<')) else col
             for col in X.columns.values]
#!pip install imbalanced-learn==0.6.0
#!pip install scikit-learn==0.22.1
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): no stratify= is used despite the class imbalance — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# imbalanced-learn 패키지
#from imblearn.over_sampling import SMOTE
# 검증 데이터나 테스트 데이터가 아닌 학습데이터에서만 오버샘플링 사용할 것
#smote = SMOTE(random_state=11)
#X_train_over, y_train_over = smote.fit_sample(X_train, y_train)
# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(random_state=42)
# X_train, y_train = ros.fit_resample(X_train, y_train)
# model building xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# Small ensemble: the dataset has only 252 rows.
xgb_clf = XGBClassifier(n_estimators=10)
# define grid
# NOTE(review): scale_pos_weight is documented for binary classification;
# its effect on this 3-class problem should be confirmed.
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
# hyper-parameter search space
xgb_param_grid = {'max_depth': list(range(2, 10)),
'subsample': np.linspace(0.4, 1, 7),
'scale_pos_weight': weights,}
# create a random search object
xgb_random = RandomizedSearchCV(estimator = xgb_clf,
param_distributions = xgb_param_grid,
n_iter = 20, # number of parameter settings sampled
scoring = 'f1_macro', # evaluation metric; macro-F1 accounts for the class imbalance
n_jobs=-1, # CPU cores to use (1: default, -1: all cores)
cv=3, # number of cross-validation folds
random_state = 42,
refit=True,
return_train_score=True)
xgb_random.fit(X_train, y_train)
# predict on both splits; macro-F1 is the primary metric because of the class imbalance
pred_train = xgb_random.predict(X_train)
y_pred = xgb_random.predict(X_test)
# accuracy
from sklearn.metrics import accuracy_score
print('train Accuracy:', accuracy_score(y_train, pred_train))
print('test Accuracy:', accuracy_score(y_test, y_pred))
print('Macro average of F1 score', f1_score(y_test, y_pred, average='macro'))
train Accuracy: 0.7810945273631841 test Accuracy: 0.9019607843137255 Macro average of F1 score 0.5803921568627451
hr_random_df = pd.DataFrame(xgb_random.cv_results_)
hr_random_df.loc[:, ['mean_test_score', "params"]]
| mean_test_score | params | |
|---|---|---|
| 0 | 0.374972 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 1 | 0.433927 | {'subsample': 0.6, 'scale_pos_weight': 25, 'ma... |
| 2 | 0.394253 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 3 | 0.428530 | {'subsample': 0.7, 'scale_pos_weight': 10, 'ma... |
| 4 | 0.367225 | {'subsample': 0.4, 'scale_pos_weight': 25, 'ma... |
| 5 | 0.427544 | {'subsample': 0.6, 'scale_pos_weight': 75, 'ma... |
| 6 | 0.412865 | {'subsample': 0.7, 'scale_pos_weight': 50, 'ma... |
| 7 | 0.438880 | {'subsample': 0.6, 'scale_pos_weight': 50, 'ma... |
| 8 | 0.371262 | {'subsample': 0.4, 'scale_pos_weight': 25, 'ma... |
| 9 | 0.364247 | {'subsample': 1.0, 'scale_pos_weight': 25, 'ma... |
| 10 | 0.426253 | {'subsample': 1.0, 'scale_pos_weight': 1000, '... |
| 11 | 0.373140 | {'subsample': 0.5, 'scale_pos_weight': 99, 'ma... |
| 12 | 0.372506 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 13 | 0.413371 | {'subsample': 0.8, 'scale_pos_weight': 75, 'ma... |
| 14 | 0.420317 | {'subsample': 0.6, 'scale_pos_weight': 1, 'max... |
| 15 | 0.364247 | {'subsample': 1.0, 'scale_pos_weight': 10, 'ma... |
| 16 | 0.370717 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 17 | 0.377296 | {'subsample': 0.4, 'scale_pos_weight': 50, 'ma... |
| 18 | 0.425763 | {'subsample': 0.7, 'scale_pos_weight': 50, 'ma... |
| 19 | 0.367225 | {'subsample': 0.4, 'scale_pos_weight': 50, 'ma... |
hr_random_df.loc[:, ['mean_train_score', 'mean_test_score', "params"]]
| mean_train_score | mean_test_score | params | |
|---|---|---|---|
| 0 | 0.735178 | 0.374972 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 1 | 0.601681 | 0.433927 | {'subsample': 0.6, 'scale_pos_weight': 25, 'ma... |
| 2 | 0.680779 | 0.394253 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 3 | 0.638688 | 0.428530 | {'subsample': 0.7, 'scale_pos_weight': 10, 'ma... |
| 4 | 0.459952 | 0.367225 | {'subsample': 0.4, 'scale_pos_weight': 25, 'ma... |
| 5 | 0.511403 | 0.427544 | {'subsample': 0.6, 'scale_pos_weight': 75, 'ma... |
| 6 | 0.632563 | 0.412865 | {'subsample': 0.7, 'scale_pos_weight': 50, 'ma... |
| 7 | 0.549193 | 0.438880 | {'subsample': 0.6, 'scale_pos_weight': 50, 'ma... |
| 8 | 0.476599 | 0.371262 | {'subsample': 0.4, 'scale_pos_weight': 25, 'ma... |
| 9 | 0.784663 | 0.364247 | {'subsample': 1.0, 'scale_pos_weight': 25, 'ma... |
| 10 | 0.490438 | 0.426253 | {'subsample': 1.0, 'scale_pos_weight': 1000, '... |
| 11 | 0.554795 | 0.373140 | {'subsample': 0.5, 'scale_pos_weight': 99, 'ma... |
| 12 | 0.738089 | 0.372506 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 13 | 0.731330 | 0.413371 | {'subsample': 0.8, 'scale_pos_weight': 75, 'ma... |
| 14 | 0.599813 | 0.420317 | {'subsample': 0.6, 'scale_pos_weight': 1, 'max... |
| 15 | 0.784663 | 0.364247 | {'subsample': 1.0, 'scale_pos_weight': 10, 'ma... |
| 16 | 0.622624 | 0.370717 | {'subsample': 0.8999999999999999, 'scale_pos_w... |
| 17 | 0.478082 | 0.377296 | {'subsample': 0.4, 'scale_pos_weight': 50, 'ma... |
| 18 | 0.638688 | 0.425763 | {'subsample': 0.7, 'scale_pos_weight': 50, 'ma... |
| 19 | 0.459952 | 0.367225 | {'subsample': 0.4, 'scale_pos_weight': 50, 'ma... |
xgb_random.best_estimator_
XGBClassifier(n_estimators=10, objective='multi:softprob', scale_pos_weight=50,
subsample=0.6)
# # Finding the best parameters using loop
# accuracy = []
# for i in range(10, 100):
# xgb_random = XGBClassifier(n_estimators=i)
# xgb_random.fit(X_train_over, y_train_over)
# y_pred = xgb_random.predict(X_test)
# accuracy.append(accuracy_score(y_test, y_pred))
# # ploting accuracy graph
# plt.plot(range(10, 100), accuracy)
# plt.ylabel('Accuracy')
# plt.xlabel('Range')
# plt.show()
# print precision, recall, f1 score
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.89 1.00 0.94 40
1 1.00 0.67 0.80 9
2 0.00 0.00 0.00 2
accuracy 0.90 51
macro avg 0.63 0.56 0.58 51
weighted avg 0.87 0.90 0.88 51
# confusion matrix for the tuned XGBoost predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 5))
ax = sns.heatmap(cm, annot=True)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.xaxis.set_ticklabels(['I', 'II', 'others'])
ax.yaxis.set_ticklabels(['I', 'II', 'others'])
# vertical axis: true label
# horizontal axis: predicted label
[Text(0, 0.5, 'I'), Text(0, 1.5, 'II'), Text(0, 2.5, 'others')]
# Feature importance for xgboost
feat_importances = pd.Series(xgb_random.best_estimator_.feature_importances_, index=X.columns)
# Plot only the ten most important features.
feat_importances.nlargest(10).plot(kind='barh')
plt.xlabel('Relative Importance')
plt.ylabel('Features')
plt.title('Feature Importances')
plt.show()
import shap
model = xgb_random.best_estimator_
shap.initjs()
# Create the explainer object
explainer = shap.TreeExplainer(model)
print('Expected Value:', explainer.expected_value)
# get the shap values from the explainer
shap_values = explainer.shap_values(X_test)
Expected Value: [ 1.01750237 0.40935561 -0.02614841]
label_data.head()
| Route | Stage | Histologic diagnosis | Grade | Myometrial invasion depth | Tumor size (largest diameter) | Extrauterine involvement | Lymphovascular space invasion (LVSI) | Metastasis of pelvic lymph node | Metastasis of para-aortic lymph node | LN metastasis | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Laparoscopic | Ia | Endometrioid | I | Less than 50% | 2.5 | none | No | 0 | 0 | no |
| 1 | Laparoscopic | Ia | Endometrioid | I | Less than 50% | 1.0 | none | No | 0 | 0 | no |
| 2 | Laparoscopic | Ia | Endometrioid | I | Less than 50% | 3.8 | none | No | 0 | 0 | no |
| 3 | Laparoscopic | Ia | Endometrioid | I | Less than 50% | 4.0 | none | No | 0 | 0 | no |
| 4 | Laparoscopic | Ia | Endometrioid | I | Less than 50% | 2.0 | none | No | 0 | 0 | no |
-Red arrows는 예측값을 더 높게하는 변수들의 영향도를 설명 (SHAP values)
-Blue arrows는 반대로 지금 예측 값을 더 낮게하는 변수들의 영향도를 나타냄.
여기서 base value는 train set에서의 모델 평균 예측을 마킹한 것.
output value는 모델의 예측이다. 가장 큰 영향을 준 변수의 값은 아래에 표시가 된다.
결국 forceplot은 예측에 대한 효과적인 요약을 제공한다.
# https://data-newbie.tistory.com/254
shap.force_plot(explainer.expected_value[0],
shap_values[0][4], X_test.iloc[4])
# https://data-newbie.tistory.com/254
shap.force_plot(explainer.expected_value[1],
shap_values[1][4], X_test.iloc[4])
# https://data-newbie.tistory.com/254
shap.force_plot(explainer.expected_value[2],
shap_values[2][4], X_test.iloc[4])
# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
background = shap.maskers.Independent(X_train)
# Wrap the model so SHAP explains the predicted probability of class 0 ('I').
# NOTE(review): predict_proba is called with validate_features=False, presumably
# to bypass xgboost's feature-name check after the column renaming — confirm the
# installed xgboost version accepts this kwarg.
def f(x):
    return shap.links.identity(model.predict_proba(x, validate_features=False)[:,0])
explainer = shap.Explainer(f, background, link=shap.links.logit)
shap_values = explainer(X_train)
# waterfall for training sample index 4 (not the first prediction)
shap.plots.waterfall(shap_values[4])
# bar plot of the SHAP values for the same single sample
shap.plots.bar(shap_values[4])
# plot the distribution of importances for each feature over all samples
shap.summary_plot(shap_values)
# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
background = shap.maskers.Independent(X_train)
def f(x):
return shap.links.identity(model.predict_proba(x, validate_features=False)[:,1])
explainer = shap.Explainer(f, background, link=shap.links.logit)
shap_values = explainer(X_train)
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[4])
# 전역 막대 플롯
shap.plots.bar(shap_values[4])
# plot the distribution of importances for each feature over all samples
shap.summary_plot(shap_values)
# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
background = shap.maskers.Independent(X_train)
def f(x):
return shap.links.identity(model.predict_proba(x, validate_features=False)[:,2])
explainer = shap.Explainer(f, background, link=shap.links.logit)
shap_values = explainer(X_train)
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[4])
# plot the global importance of each feature
shap.plots.bar(shap_values[4])
# plot the distribution of importances for each feature over all samples
shap.summary_plot(shap_values)
asdasd
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_27552/2657870816.py in <module> ----> 1 asdasd NameError: name 'asdasd' is not defined
# model building catboost
from catboost import CatBoostClassifier
model2 = CatBoostClassifier(iterations=107)
model2.fit(X_train, y_train)
# predict
y_pred = model2.predict(X_test)
# Print accuracy
from sklearn.metrics import accuracy_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Macro average of F1 score', f1_score(y_test, y_pred, average='macro'))
# print classification report
from sklearn.metrics import classification_report
print('Classification report\n',classification_report(y_test, y_pred))
# Simple parameter tuning: refit CatBoost for each iteration count and record
# test accuracy. NOTE(review): an earlier note said iterations 100~115 were
# searched, but the loop below scans 40-79 — confirm which range was intended.
# Simple parameter tuning using loop
accuracy = []
for i in range(40, 80):
    model2 = CatBoostClassifier(iterations=i)
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
# plotting accuracy graph
plt.plot(range(40, 80), accuracy)
plt.ylabel('Accuracy')
plt.xlabel('Range')
plt.show()
# model building catboost with the best iteration count found by the loop above
from catboost import CatBoostClassifier
model2 = CatBoostClassifier(iterations=67)
model2.fit(X_train, y_train)
# predict
y_pred = model2.predict(X_test)
# Print accuracy
from sklearn.metrics import accuracy_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Macro average of F1 score', f1_score(y_test, y_pred, average='macro'))
# print classification report
from sklearn.metrics import classification_report
print('Classification report\n',classification_report(y_test, y_pred))
# confusion matrix for the CatBoost predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 5))
ax = sns.heatmap(cm, annot=True)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.xaxis.set_ticklabels(['I', 'II', 'others'])
ax.yaxis.set_ticklabels(['I', 'II', 'others'])
# vertical axis: true label
# horizontal axis: predicted label
# Feature importance for the CatBoost model (the previous comment said "xgboost")
feat_importances = pd.Series(model2.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.xlabel('Relative Importances')
plt.ylabel('Features')
plt.title('Feature Importances')
plt.show()